High coverage data

Select high coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("fluidigm")
fluidigm_high <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="High")]
filter <- apply(assay(fluidigm_high)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 6998

No normalization

Fit a zinb model on the high coverage data (no normalization).

fluidigm_high <- fluidigm_high[filter,]
high <- assay(fluidigm_high)
set.seed(23424)
print(system.time(zinb_high <- zinbFit(high, ncores = 3, K = 2)))
##    user  system elapsed 
## 670.571  72.622 296.439

Plot the results with cells colored according to their biological condition.

bio <- as.factor(colData(fluidigm_high)$Biological_Condition)
cl <- as.factor(colData(fluidigm_high)$Cluster2)

plot(zinb_high@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_high@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

Compared to PCA (of the log counts)

pca <- prcomp(t(log1p(high)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)

plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_high@W[,1], colSums(high>0),method="spearman")
## [1] -0.2168706
#second factor and the total number of detected genes in the cell
cor(zinb_high@W[,2], colSums(high>0),method="spearman")
## [1] 0.5244318
#first factor and the total number of counts in the cell
cor(zinb_high@W[,1], colSums(high),method="spearman")
## [1] -0.1518794
#second factor and the total number of counts in the cell
cor(zinb_high@W[,2], colSums(high),method="spearman")
## [1] 0.3468531

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(high>0), method="spearman")
## [1] 0.9232517
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(high>0), method="spearman")
## [1] 0.297028
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(high), method="spearman")
## [1] 0.1270105
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(high), method="spearman")
## [1] -0.3141171

Correlation between PCA and ZINB

cor(pca$x[,1], zinb_high@W[,1], method="spearman")
## [1] -0.2783654
cor(pca$x[,2], zinb_high@W[,1], method="spearman")
## [1] -0.01188811
cor(pca$x[,1], zinb_high@W[,2], method="spearman")
## [1] 0.7553322
cor(pca$x[,2], zinb_high@W[,2], method="spearman")
## [1] -0.4728147

Using scran size factors

sce <- newSCESet(countData=data.frame(high))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))

sf <- sizeFactors(sce)

norm <- exprs(convertTo(sce, type="monocle"))
fq <- betweenLaneNormalization(high, which="full")

plotRLE(high, outline=FALSE, col=cols[bio], main="Unnormalized counts")

plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")

plotRLE(fq, outline=FALSE, col=cols[bio], main="FQ normalization")

I’m not convinced that this is the best normalization, but since it’s the only one specifically proposed for scRNA-seq, it’s a good place to start.

set.seed(3525)
offsets <- matrix(rep(sf, NROW(high)), ncol=NROW(high), nrow=NCOL(high))
print(system.time(zinb_norm <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 748.390  80.517 318.133

Plot the results with cells colored according to their biological condition.

plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

qc <- as.matrix(colData(fluidigm_high)[,metadata(fluidigm_high)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)

quality <- qcpca$x[,1]
detection_rate <- colSums(high>0)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)

plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(high>0),method="spearman")
## [1] -0.2240385
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(high>0),method="spearman")
## [1] 0.4842657
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(high),method="spearman")
## [1] -0.1604021
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(high),method="spearman")
## [1] 0.3326923

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(high>0), method="spearman")
## [1] 0.9566434
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(high>0), method="spearman")
## [1] -0.1056818
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(high), method="spearman")
## [1] -0.07137238
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(high), method="spearman")
## [1] 0.4917832

Correlation between PCA and ZINB

cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] -0.2262675
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.004195804
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.6608392
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.6752622

Using FQ derived offsets

se <- newSeqExpressionSet(high)
se <- betweenLaneNormalization(se, which="full", offset=TRUE)
offsets <- t(offst(se))
set.seed(35235)
print(system.time(zinb_fq <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 841.972  87.240 353.576

Plot the results with cells colored according to their biological condition.

plot(zinb_fq@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_fq@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

data.frame(W1=zinb_fq@W[,1], W2=zinb_fq@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_fq <- prcomp(t(log1p(fq)))
plot(pca_fq$x, col=cols[bio], pch=19)
legend("topright", levels(bio), fill=cols)

plot(pca_fq$x, col=cols2[cl], pch=19)
legend("topright", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_fq@W[,1], colSums(high>0),method="spearman")
## [1] 0.08378497
#second factor and the total number of detected genes in the cell
cor(zinb_fq@W[,2], colSums(high>0),method="spearman")
## [1] 0.3754808
#first factor and the total number of counts in the cell
cor(zinb_fq@W[,1], colSums(high),method="spearman")
## [1] -0.1039336
#second factor and the total number of counts in the cell
cor(zinb_fq@W[,2], colSums(high),method="spearman")
## [1] 0.3515297

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_fq$x[,1], colSums(high>0), method="spearman")
## [1] 0.7178322
#second factor and the total number of detected genes in the cell
cor(pca_fq$x[,2], colSums(high>0), method="spearman")
## [1] 0.6338724
#first factor and the total number of counts in the cell
cor(pca_fq$x[,1], colSums(high), method="spearman")
## [1] 0.2144231
#second factor and the total number of counts in the cell
cor(pca_fq$x[,2], colSums(high), method="spearman")
## [1] -0.2588287

Correlation between PCA and ZINB

cor(pca_fq$x[,1], zinb_fq@W[,1], method="spearman")
## [1] -0.002097902
cor(pca_fq$x[,2], zinb_fq@W[,1], method="spearman")
## [1] 0.1466783
cor(pca_fq$x[,1], zinb_fq@W[,2], method="spearman")
## [1] 0.7437937
cor(pca_fq$x[,2], zinb_fq@W[,2], method="spearman")
## [1] -0.1988199

Low coverage data

Select low coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.

fluidigm_low <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="Low")]
filter <- apply(assay(fluidigm_low)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 6998

No normalization

Fit a zinb model on the low coverage data (no normalization).

fluidigm_low <- fluidigm_low[filter,]
low <- assay(fluidigm_low)
set.seed(654)
print(system.time(zinb_low <- zinbFit(low, ncores = 3, K = 2)))
##    user  system elapsed 
## 379.503  60.009 163.815

Plot the results with cells colored according to their biological condition.

bio <- as.factor(colData(fluidigm_low)$Biological_Condition)
cl <- as.factor(colData(fluidigm_low)$Cluster2)

plot(zinb_low@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_low@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

Compared to PCA (of the log counts)

pca <- prcomp(t(log1p(low)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)

plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_low@W[,1], colSums(low>0),method="spearman")
## [1] 0.4277894
#second factor and the total number of detected genes in the cell
cor(zinb_low@W[,2], colSums(low>0),method="spearman")
## [1] -0.5360534
#first factor and the total number of counts in the cell
cor(zinb_low@W[,1], colSums(low),method="spearman")
## [1] 0.0005681818
#second factor and the total number of counts in the cell
cor(zinb_low@W[,2], colSums(low),method="spearman")
## [1] 0.2904283

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(low>0), method="spearman")
## [1] -0.6268781
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(low>0), method="spearman")
## [1] -0.03942437
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(low), method="spearman")
## [1] 0.06979895
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(low), method="spearman")
## [1] -0.474257

Correlation between PCA and ZINB

cor(pca$x[,1], zinb_low@W[,1], method="spearman")
## [1] -0.8020979
cor(pca$x[,2], zinb_low@W[,1], method="spearman")
## [1] -0.06770105
cor(pca$x[,1], zinb_low@W[,2], method="spearman")
## [1] 0.8125874
cor(pca$x[,2], zinb_low@W[,2], method="spearman")
## [1] -0.4075612

Using scran size factors

sce <- newSCESet(countData=data.frame(low))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))

sf <- sizeFactors(sce)

norm <- exprs(convertTo(sce, type="monocle"))

plotRLE(low, outline=FALSE, col=cols[bio], main="Unnormalized counts")

plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")

set.seed(3525)
offsets <- matrix(rep(sf, NROW(low)), ncol=NROW(low), nrow=NCOL(low))
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 266.052  45.152 117.459

Plot the results with cells colored according to their biological condition.

plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

qc <- as.matrix(colData(fluidigm_low)[,metadata(fluidigm_low)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)

quality <- qcpca$x[,1]
detection_rate <- colSums(low>0)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)

plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(low>0),method="spearman")
## [1] 0.02917491
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(low>0),method="spearman")
## [1] -0.5416043
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(low),method="spearman")
## [1] -0.03199301
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(low),method="spearman")
## [1] 0.1593969

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(low>0), method="spearman")
## [1] -0.8312772
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(low>0), method="spearman")
## [1] 0.5694024
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(low), method="spearman")
## [1] 0.4491259
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(low), method="spearman")
## [1] -0.5332168

Correlation between PCA and ZINB

cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] 0.2093969
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.4050699
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.859965
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.1836538

Include detection rate as covariates

x <- model.matrix(~scale(detection_rate))
set.seed(9948)
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, X = x, which_X_mu=1:2, which_X_pi=1L)))
##    user  system elapsed 
## 381.644  47.689 170.539
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Todo list

  • Add gc-content and length as gene-level covariates in pi regression
  • If somehow the first column of W identifies outlying samples, we could perhaps see if the second and third components are a better projection. [This doesn’t seem to work; for some reason, the last factor W is always OK, but the firts n-1 are always just picking a few cells apart.]
  • What’s special with those outliers? [Detection rate?]
  • It seems that there is also a good amount of randomness in W (i.e., running zinbFit multiple times seems to create very different projections).